#load required R libraries
library(nhanesA)
library(forcats)
library(boot)
library(ggplot2)
library(ggpubr)
library(kableExtra)
library(janitor)
library(naniar)
library(psych)
library(patchwork)
library(gridExtra)
library(mice)
library(broom.mixed)
library(Epi)
library(vcd)
library(xfun)
library(tidyverse)
theme_set(theme_light())
knitr::opts_chunk$set(comment=NA)Predicting Health Outcomes Using NHANES Data: Insights from Multivariable Analyses
1. Setup and Data Ingest
1.1. Loading R packages
2. Cleaning the Data
2.1. Identify a Quantitative outcome
The quantitative outcome of this analysis is Body Mass Index (BMI). BMI is often used as an indicator for assessing body fat based on height and weight and is also quite strongly associated with an individual’s quality of life. The BMI data used for this analysis were collected from participants in the 2017-March 2020 NHANES survey and recorded as BMXBMI. ## 2.2. Identify key predictor and three other predictors of outcome
The key predictor for this analysis is Gender. Physiological differences between people of different sexes often prevail, and these may result in differences in BMI due to biological, hormonal, and societal factors influencing body composition, metabolism, and lifestyle behaviors, among others. Understanding how BMI varies with gender within a population, and how this relationship may be affected by other confounding factors including age, smoking status, and race, may provide us with insights into how this association may influence gender-based health outcomes within the United States population.
2.3. Data preparation
Data are obtained from the 2017-March 2020 survey through the nhanesA package. This package allows you to access demographic data, examination data, and questionnaires, among others. How we gathered all the data used in the analysis is shown in the code section below.
# Read the pre-assembled NHANES 2017-March 2020 analysis data set
# (columns: SEQN, Age, Gender, Race, Smoking, BMI)
analysis2Data <- readRDS('data/Analyses2Data.rds')
# Check for missing values
# Count NAs per column; per the printed summary only Race has missing entries (380)
missing_summary <- colSums(is.na(analysis2Data))
missing_summary SEQN Age Gender Race Smoking BMI
0 0 0 380 0 0
# Imputation using `mice`
# Single imputation (m = 1) by predictive mean matching; only Race is missing.
# FIX: a fixed seed is supplied so the imputed values are reproducible across
# runs (the original call drew from the session RNG state).
imputed_data <- mice(analysis2Data, m = 1, method = 'pmm', maxit = 5, seed = 112724)
iter imp variable
1 1 Race
2 1 Race
3 1 Race
4 1 Race
5 1 Race
complete_data <- complete(imputed_data)
# Display summary
kable(head(complete_data)) %>%
kable_styling(bootstrap_options = "striped", full_width = F)| SEQN | Age | Gender | Race | Smoking | BMI |
|---|---|---|---|---|---|
| 109266 | 29 | Female | Asian | Non_Smoker | 37.8 |
| 109271 | 49 | Male | White | Smoker | 29.7 |
| 109273 | 36 | Male | White | Smoker | 21.9 |
| 109274 | 68 | Male | Asian | Non_Smoker | 30.2 |
| 109282 | 76 | Male | White | Smoker | 26.6 |
| 109284 | 44 | Female | Mexican | Non_Smoker | 39.1 |
3. Codebook and Data Description
# Codebook: one row per analysis variable, mapping it to its role, type, and
# original NHANES variable code.
# FIX: the Race and BMI rows were fused on one line, "Outcome" was misspelled
# "Outocome", and two string literals carried stray whitespace.
variable_description <- tibble::tribble(
  ~Variable_Name, ~Role_in_Analysis, ~Type, ~Original_Code,
  "SEQN", "ID", "Quant", "Respondent sequence number",
  "RIDSTATR", "Interview/Examination status", "Binary", "RIDSTATR",
  "Gender", "Analysis Key Predictor", "Binary", "RIAGENDR",
  "Age", "Analysis key predictor and used to select Adult persons aged 21-79 years", "Quant", "RIDAGEYR",
  "Race/Ethnicity", "Analysis Predictor", "X-cat", "RIDRETH3",
  "BMI", "Analysis Outcome", "Quant", "BMXBMI",
  "Smoking", "Analysis Predictor", "Binary", "SMQ020")
variable_description |> kbl() |> kable_styling(bootstrap_options = "striped", full_width = F)| Variable_Name | Role_in_Analysis | Type | Original_Code |
|---|---|---|---|
| SEQN | ID | Quant | Respondent sequence number |
| RIDSTATR | Interview/Examination status | Binary | RIDSTATR |
| Gender | Analysis Key Predictor | Binary | RIAGENDR |
| Age | Analysis key predictor and used to select Adult persons aged 21-79 years | Quant | RIDAGEYR |
| Race/Ethnicity | Analysis Predictor | X-cat | RIDRETH3 |
| BMI | Analysis Outocome | Quant | BMXBMI |
| Smoking | Analysis Predictor | Binary | SMQ020 |
3.1. Data Description
analysis2Data <- readRDS('data/Analyses2Data.rds')
analysis2Data%>% select(Age, Gender, Race, Smoking, BMI) %>% describe() %>% kable()%>% kable_styling(bootstrap_options = "striped", full_width = F)| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1 | 7732 | 49.250647 | 15.9952871 | 50.0 | 49.321694 | 19.27380 | 21.0 | 79.0 | 58.0 | -0.0602027 | -1.1321615 | 0.1819056 |
| Gender* | 2 | 7732 | 1.517719 | 0.4997183 | 2.0 | 1.522147 | 0.00000 | 1.0 | 2.0 | 1.0 | -0.0709051 | -1.9952305 | 0.0056830 |
| Race* | 3 | 7352 | 3.194913 | 1.1874789 | 3.0 | 3.243625 | 1.48260 | 1.0 | 5.0 | 4.0 | -0.3506428 | -0.6270779 | 0.0138492 |
| Smoking* | 4 | 7732 | 1.583937 | 0.4929361 | 2.0 | 1.604914 | 0.00000 | 1.0 | 2.0 | 1.0 | -0.3405148 | -1.8842933 | 0.0056059 |
| BMI | 5 | 7732 | 30.237739 | 7.6792048 | 28.9 | 29.474232 | 6.52344 | 14.6 | 92.3 | 77.7 | 1.3473715 | 3.8124043 | 0.0873314 |
The participants included in our analysis had a mean age of 49 and median age of 50, with mean Body Mass Index (BMI) of 30.24 and median BMI of 28.9, gender was categorized into female and males, race had 5 levels of Mexican American, Non-Hispanic White, Non-Hispanic black, Non-Hispanic Asian, and other Hispanics.
4. My Research Question
4.1. Problem Statement
Body Mass Index (BMI) is a widely used indicator for assessing body fat based on height and weight. Gender differences in BMI may arise due to biological, hormonal, and societal factors influencing body composition, metabolism, lifestyle behaviors among others. Understanding how BMI varies with gender within a population while taking into account other confounding factors such as age, race/ethnicity, and smoking status may provide us with insights on how this association may influence the gender based health outcomes within the United States population.
4.2. Research Question
Is there significant relationship between Body Mass Index (BMI) and gender, while adjusting for an individual’s age, race/ethnicity, and smoking status?
5. Partitioning the Data
Here, I divided my analysis data into training and testing data sets in a split ratio of 70% to 30%. This provides us with an opportunity to validate our model by prediction with the test dataset.
# Split analysis data into training (70%) and testing (30%)
# Fixed seed makes the random 70/30 split reproducible
set.seed(112724)
# slice_sample() draws 70% of the rows without replacement for training
training_data <- complete_data %>% slice_sample(prop = 0.7)
# Rows not sampled into training (matched on the unique SEQN id) form the test set
testing_data <- anti_join(complete_data, training_data, by = "SEQN")
# Confirm split
nrow(training_data)[1] 5412
nrow(testing_data)[1] 2320
[1] TRUE
The data was split into the training data set with 5412 participants and a testing data set with 2320 participants.
6. Transforming the Outcome
Using the histogram and normal Q-Q plots, I assessed the normality and the distribution of the outcome variable to understand whether I needed to perform any data transformation.
# Histogram and Q-Q Plot for the raw BMI outcome.
# FIX: geom_histogram() was given both bins = 50 and binwidth = 3; binwidth
# silently wins, so the dead bins argument is removed.
p1 <- ggplot(training_data, aes(x = BMI)) +
  geom_histogram(binwidth = 3, fill = "blue", alpha = 0.7) +
  labs(title = "BMI Distribution") +
  theme_classic(base_size = 8) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold", size = 10),
        axis.title.y = element_text(face = "bold", size = 10),
        axis.title.x = element_text(face = "bold", size = 10),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
# Normal Q-Q plot of raw BMI to assess normality of the outcome.
# FIX: theme(aspect.ratio = 1) originally preceded theme_classic(), which is a
# complete theme and wiped it; it is applied after theme_classic() so it takes
# effect.
p2 <- ggplot(training_data, aes(sample = BMI)) +
  geom_qq(col = "slateblue") +
  geom_qq_line(col = "magenta") +
  labs(title = "Normal Q-Q: NHANES BMI") +
  theme_classic(base_size = 8) +
  theme(aspect.ratio = 1) +
  theme(legend.position = "none",
        axis.text = element_text(face = "bold", size = 10),
        axis.title.y = element_text(face = "bold", size = 10),
        axis.title.x = element_text(face = "bold", size = 10),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
p1 + p2From both the histogram and the normal Q-Q plot, I noticed that the BMI distribution violates the normality assumption: the data are right skewed, indicating a need to transform the outcome, for example with a log transformation.
# Histogram and Q-Q Plot for log transformed BMI.
# FIX: geom_histogram() was passed `bin = 50`, which is not a real argument
# (ggplot2 warns "Ignoring unknown parameters: bin"); only binwidth is kept.
p1 <- ggplot(training_data, aes(x = log(BMI))) +
  geom_histogram(binwidth = .08, fill = "blue", alpha = 0.7) +
  labs(title = "log(BMI) Distribution") +
  theme_classic(base_size = 8) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold", size = 10),
        axis.title.y = element_text(face = "bold", size = 10),
        axis.title.x = element_text(face = "bold", size = 10),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
# Normal Q-Q plot of log(BMI).
# FIX: theme(aspect.ratio = 1) moved after theme_classic() (a complete theme
# that would otherwise discard it).
p2 <- ggplot(training_data, aes(sample = log(BMI))) +
  geom_qq(col = "slateblue") +
  geom_qq_line(col = "magenta") +
  labs(title = "Normal Q-Q: NHANES log(BMI)") +
  theme_classic(base_size = 8) +
  theme(aspect.ratio = 1) +
  theme(legend.position = "none",
        axis.text = element_text(face = "bold", size = 10),
        axis.title.y = element_text(face = "bold", size = 10),
        axis.title.x = element_text(face = "bold", size = 10),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
p1 + p2After the log transformation, the outcome variable became more symmetrical and most of the points lie on the line, with slight left skewness.
7. The Big Model
Using the training data, I fitted a multivariate linear regression model as a means of assessing the association between Gender and log transformed Body mass index and other confounding variables of age, race/ethnicity, and smoking status
# Big Model
# Add the log-transformed outcome as a column of the training data
training_data <- training_data%>% mutate(log_BMI = log(BMI))
# Multivariable linear regression: log(BMI) on the key predictor Gender plus
# the confounders Age, Race/ethnicity, and Smoking status
big_model <- lm(log_BMI ~ Gender + Age + Race + Smoking, data = training_data)
summary(big_model)
Call:
lm(formula = log_BMI ~ Gender + Age + Race + Smoking, data = training_data)
Residuals:
Min 1Q Median 3Q Max
-0.69670 -0.15420 -0.01149 0.14289 1.09315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.3520324 0.0138430 242.147 < 2e-16 ***
GenderFemale 0.0322878 0.0064219 5.028 5.12e-07 ***
Age 0.0006886 0.0001983 3.472 0.000521 ***
RaceHispanic -0.0240295 0.0128933 -1.864 0.062414 .
RaceWhite -0.0321550 0.0104481 -3.078 0.002097 **
RaceBlack 0.0153440 0.0105531 1.454 0.146012
RaceAsian -0.1619028 0.0123579 -13.101 < 2e-16 ***
SmokingNon_Smoker 0.0088150 0.0066868 1.318 0.187472
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2321 on 5404 degrees of freedom
Multiple R-squared: 0.05929, Adjusted R-squared: 0.05807
F-statistic: 48.66 on 7 and 5404 DF, p-value: < 2.2e-16
7.1. Big Model Summary Statistics
tidy(big_model)%>% kable(digits = 3)%>%kable_styling(bootstrap_options = "striped", full_width = F)| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 3.352 | 0.014 | 242.147 | 0.000 |
| GenderFemale | 0.032 | 0.006 | 5.028 | 0.000 |
| Age | 0.001 | 0.000 | 3.472 | 0.001 |
| RaceHispanic | -0.024 | 0.013 | -1.864 | 0.062 |
| RaceWhite | -0.032 | 0.010 | -3.078 | 0.002 |
| RaceBlack | 0.015 | 0.011 | 1.454 | 0.146 |
| RaceAsian | -0.162 | 0.012 | -13.101 | 0.000 |
| SmokingNon_Smoker | 0.009 | 0.007 | 1.318 | 0.187 |
From the Big model we observed that being female, age, and race: Hispanic, non-Hispanic white, non-Hispanic black, and non-Hispanic Asian were statistically significant predictors of an individuals BMI (p-value < 0.05).
7.2. Model Residuals
7.3. Diagnostic Plots
# Diagnostic plots for the big model.
# FIX(review): the original chunk referenced `fitted_values` and `residuals`
# without defining them in the visible code — `residuals` would resolve to the
# base R *function*, not a vector. Both are now derived explicitly from
# big_model, and the residual vector is named to avoid shadowing the function.
fitted_values <- fitted(big_model)
model_residuals <- resid(big_model)
# Residuals vs Fitted Plot: checks linearity and constant variance
p1 <- ggplot(data = data.frame(fitted_values, residuals = model_residuals),
             aes(x = fitted_values, y = residuals)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Fitted",
       x = "Fitted Values",
       y = "Residuals") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 9),
    axis.title.y = element_text(face = "bold", size = 9),
    axis.title.x = element_text(face = "bold", size = 9),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
# Normal Q-Q Plot: checks normality of the residuals
p2 <- ggplot(data = data.frame(residuals = model_residuals),
             aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Normal Q-Q Plot",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 9),
    axis.title.y = element_text(face = "bold", size = 9),
    axis.title.x = element_text(face = "bold", size = 9),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
# Scale-Location Plot: sqrt(|residuals|) vs fitted values (homoscedasticity)
sqrt_abs_residuals <- sqrt(abs(model_residuals))
p3 <- ggplot(data = data.frame(fitted_values, sqrt_abs_residuals),
             aes(x = fitted_values, y = sqrt_abs_residuals)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Scale-Location Plot",
       x = "Fitted Values",
       y = "√|Residuals|") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 9),
    axis.title.y = element_text(face = "bold", size = 9),
    axis.title.x = element_text(face = "bold", size = 9),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
# Residuals vs Leverage Plot: flags potentially influential observations
leverage <- hatvalues(big_model)
p4 <- ggplot(data = data.frame(leverage, residuals = model_residuals),
             aes(x = leverage, y = residuals)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Leverage",
       x = "Leverage",
       y = "Residuals") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 9),
    axis.title.y = element_text(face = "bold", size = 9),
    axis.title.x = element_text(face = "bold", size = 9),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 10))
p1 + p2 + p3 + p4From the residuals vs. Fitted plot, I observed that the residuals do not appear to be randomly scattered around the red dashed horizontal line at zero and there are clusters indicating non-linearity. The spread of residuals seemed to increase slightly for higher fitted values, suggesting heteroscedasticity. The normal Q-Q plot shows that residuals deviate from the red line, particularly in the top tail, indicating that residuals are not normally distributed, hence suggesting skewness. Additionally, from the Scale-location plot, the red trend line shows a slight upward slope, suggesting increasing variance of residuals with fitted values. This altogether points to violation of assumptions for homoscedasticity, linearity, normality of residuals.
8. The Smaller Model
The small model only interrogated the direct relationship between BMI and Gender, without considering any confounding variables.
# Smaller Model
# Simple linear regression of log(BMI) on Gender alone (no confounders)
small_model <- lm(log_BMI ~ Gender, data = training_data)
# Keep the summary object and print it
Summary_small_model <- summary(small_model)
Summary_small_model
Call:
lm(formula = log_BMI ~ Gender, data = training_data)
Residuals:
Min 1Q Median 3Q Max
-0.71090 -0.16508 -0.01434 0.14995 1.13312
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.361598 0.004709 713.848 < 2e-16 ***
GenderFemale 0.030325 0.006497 4.667 3.12e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2387 on 5410 degrees of freedom
Multiple R-squared: 0.004011, Adjusted R-squared: 0.003826
F-statistic: 21.78 on 1 and 5410 DF, p-value: 3.124e-06
Extract summary statistics from the small model
tidy(small_model, conf.int = TRUE)%>% kable(digits = 3)%>% kable_styling(bootstrap_options = "striped", full_width = F)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 3.362 | 0.005 | 713.848 | 0 | 3.352 | 3.371 |
| GenderFemale | 0.030 | 0.006 | 4.667 | 0 | 0.018 | 0.043 |
9. In-Sample Comparison
Here, we assess model performance between the big-model and small-model using the training data set and assess the quality of model fitting by looking at key statistical measures such as R-Squared, Adjusted R-Squared, Akaike Information criterion (AIC) and Bayesian Information Criterion (BIC) and Log-Likelihood
9.1. Quality of Fit
# Quality-of-fit comparison: take one-row glance() summaries of each model and
# assemble them into a table of R^2, adjusted R^2, AIC, and BIC.
big_model_summary <- glance(big_model)
small_model_summary <- glance(small_model)
model_glances <- list(big_model_summary, small_model_summary)
# Pull the named statistic out of each glance row as a numeric vector
fit_stat <- function(stat) {
  vapply(model_glances, function(g) g[[stat]], numeric(1))
}
fit_comparison <- data.frame(
  Model = c("Big Model", "Small Model"),
  R_squared = fit_stat("r.squared"),
  Adjusted_R_squared = fit_stat("adj.r.squared"),
  AIC = fit_stat("AIC"),
  BIC = fit_stat("BIC")
)
fit_comparison%>% kable(digits = 3)%>%kable_styling(bootstrap_options = "striped", full_width = F)| Model | R_squared | Adjusted_R_squared | AIC | BIC |
|---|---|---|---|---|
| Big Model | 0.059 | 0.058 | -441.081 | -381.714 |
| Small Model | 0.004 | 0.004 | -144.057 | -124.268 |
Based on the Adjusted_R_squared, AIC, and BIC, the big_model showed a very good fit to the data given it showed the lowest most negative AIC, BIC values. This implies that together with other confounding factors, Gender better predicts an individual’s BMI.
9.2. Posterior Predictive Checks
# In-sample predictions from each model (on the log scale, matching log_BMI)
training_data$Big_Model_Pred <- predict(big_model, training_data)
training_data$Small_Model_Pred <- predict(small_model, training_data)
# Overlay the observed log(BMI) density with each model's predicted-value
# density; closer overlap indicates the model reproduces the outcome's shape
ggplot(training_data, aes(x = log(BMI))) +
geom_density(aes(fill = "Observed"), alpha = 0.5) +
geom_density(aes(x = Big_Model_Pred, fill = "Big Model Prediction"), alpha = 0.5) +
geom_density(aes(x = Small_Model_Pred, fill = "Small Model Prediction"), alpha = 0.5) +
scale_fill_manual(values = c("Observed" = "blue",
"Big Model Prediction" = "red",
"Small Model Prediction" = "green")) +
labs(title = "Posterior Predictive Check: Observed vs Predicted BMI",
x = "log_BMI", y = "Density", fill = "Model") +
theme_classic(base_size = 8) +
theme(
axis.text = element_text(face = "bold",size = 10),
axis.title.y = element_text(face = "bold", size = 10),
axis.title.x = element_text(face = "bold", size = 10),
plot.title = element_text(face = "bold", hjust = 0.5, size = 12))From the plot, we observe that the Big Model’s predictions align more closely with the observed density curve compared to the Small model. This suggests that the big model captures the variability in the data more effectively by incorporating additional predictors. This particularly reduces prediction error and improves model accuracy.
9.3. Assessing Assumptions
To check whether the fitted models meets the linearity assumption, normality of residuals, homoscedasticity, and influential points, I extracted the residuals and their fitted values to plot the ‘Residuals vs. Fitted Plot’, normality Q-Q plot, Scale-location plot, and the residuals vs. leverage plot as shown below.
9.3.1. Residuals vs. Fitted plot
# Residuals for Big and Small Models (augment() attaches .fitted and .resid)
Residual_Big <- augment(big_model)
Residual_Small <- augment(small_model)
# Residual plots
# FIX: both models are fit on log_BMI, so the fitted values are on the log
# scale; the x-axis labels now say "Predicted log(BMI)" instead of the
# misleading "Predicted BMI".
p1 <- ggplot(Residual_Big) +
  geom_point(aes(x = .fitted, y = .resid), color = "red") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs. Predicted (Big Model)",
       x = "Predicted log(BMI) (Big Model)",
       y = "Residuals") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
p2 <- ggplot(Residual_Small) +
  geom_point(aes(x = .fitted, y = .resid), color = "green") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs. Predicted (Small Model)",
       x = "Predicted log(BMI) (Small Model)",
       y = "Residuals") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
p1+p2From the residuals vs. predicted plots above, the Big model shows more evenly distributed residuals around the zero line, suggesting it. captures the variance in the data better and meets the homoscedasticity assumption, whereas on the other hand, the small model has residuals clustered at specific predicted values, indicating poor fit and limited variability in its predictions, reflecting the oversimplification of using only gender as a predictor.
9.3.2. Q-Q Plot
This plot assesses normality of residuals by comparing them to a theoretical normal distribution.
# Q-Q plots of residuals for both models. The axis/title styling is identical
# for the two panels, so it is built once and reused.
qq_axis_theme <- theme(
  axis.text = element_text(face = "bold", size = 10),
  axis.title.y = element_text(face = "bold", size = 10),
  axis.title.x = element_text(face = "bold", size = 10),
  plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
# Big Model Q-Q Plot
p1 <- ggplot(Residual_Big, aes(sample = .resid)) +
  stat_qq(color = "blue") +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot (Big Model)",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_classic(base_size = 8) +
  qq_axis_theme
# Small Model Q-Q Plot
p2 <- ggplot(Residual_Small, aes(sample = .resid)) +
  stat_qq(color = "green") +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot (Small Model)",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_classic(base_size = 8) +
  qq_axis_theme
p1+p2From the Q-Q plots, we observed that for the small model the residuals align more closely with the theoretical quantile line, suggesting better adherence to the normality assumption and hence a better fit, whereas the big model shows deviation from the theoretical quantile line, particularly at the extremes, suggesting a lack of normality in the residuals and highlighting potential outliers affecting the distribution.
9.3.3. Scale-Location Plot
Here, we plot the square root of the standardized residuals to assess constant variance
# Scale-Location plots: sqrt(|standardized residuals|) against fitted values,
# to assess constant variance.
# FIX: the original mapped y = .std.resid directly while labelling the axis
# "√|Standardized Residuals|"; the square-root-of-absolute-value transform is
# now actually applied, making this a true scale-location plot.
p1 <- ggplot(Residual_Big, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Scale-Location Plot (Big Model)",
       x = "Fitted Values",
       y = "√|Standardized Residuals|") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
# Small Model Scale-Location Plot (same fix applied)
p2 <- ggplot(Residual_Small, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
  geom_point(color = "green", alpha = 0.6) +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Scale-Location Plot (Small Model)",
       x = "Fitted Values",
       y = "√|Standardized Residuals|") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
p1+p2The Scale-Location plot for the big model (left) indicates heteroscedasticity, as the spread of residuals varies with fitted values, and the red trendline is not flat, suggesting non-constant variance. Conversely, the small model (right) shows a consistent spread of residuals around the red line, indicating more uniform variance but limited variability in fitted values. These results suggest that while the small model has a simpler structure, the big model may suffer from issues related to over-fitting.
9.3.4. Residuals vs. Leverage Plot
# Residuals-vs-leverage plots with point size mapped to Cook's distance, to
# flag observations with disproportionate influence on each model's fit.
# One row per observation: leverage, raw residual, and Cook's distance
influence_frame <- function(model) {
  data.frame(
    Leverage = hatvalues(model),
    Residuals = resid(model),
    Cooks_Distance = cooks.distance(model)
  )
}
# Shared plot recipe; only the data, point colour, and title differ per model
leverage_plot <- function(df, point_color, plot_title) {
  ggplot(df, aes(x = Leverage, y = Residuals, size = Cooks_Distance)) +
    geom_point(alpha = 0.6, color = point_color) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    scale_size_continuous(name = "Cook's Distance") +
    labs(title = plot_title,
         x = "Leverage",
         y = "Residuals") +
    theme_classic(base_size = 8) +
    theme(
      axis.text = element_text(face = "bold", size = 10),
      axis.title.y = element_text(face = "bold", size = 10),
      axis.title.x = element_text(face = "bold", size = 10),
      plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
}
big_model_influence <- influence_frame(big_model)
small_model_influence <- influence_frame(small_model)
p1 <- leverage_plot(big_model_influence, "blue", "Residuals vs Leverage (Big Model)")
p2 <- leverage_plot(small_model_influence, "green", "Residuals vs Leverage (Small Model)")
p1+p2The Residuals vs Leverage plot for the Big Model shows a wide spread of residuals with varying leverage values, indicating potential influential points that could disproportionately impact the model’s fit. Cook’s Distance circles suggest the presence of points with moderate influence, though none appear to be extreme outliers. In contrast, the small model demonstrates a more clustered pattern of residuals and leverage, with minimal influential points, reflecting its simplicity and robustness against leverage but potentially lacking the complexity to capture variability.
9.4. Comparing the Models
# In-sample fit comparison table. NOTE(review): this duplicates the Section
# 9.1 table, here computed from summary()/AIC()/BIC() rather than
# broom::glance(); the two approaches give the same numbers.
comparison <- data.frame(
Model = c("Big Model", "Small Model"),
R_squared = c(summary(big_model)$r.squared, summary(small_model)$r.squared),
Adjusted_R_squared = c(summary(big_model)$adj.r.squared, summary(small_model)$adj.r.squared),
AIC = c(AIC(big_model), AIC(small_model)),
BIC = c(BIC(big_model), BIC(small_model))
)
kable(comparison, digits = 3) %>%
kable_styling(bootstrap_options = "striped", full_width = F)| Model | R_squared | Adjusted_R_squared | AIC | BIC |
|---|---|---|---|---|
| Big Model | 0.059 | 0.058 | -441.081 | -381.714 |
| Small Model | 0.004 | 0.004 | -144.057 | -124.268 |
9.4.1. Comparing R-squared and Adjusted R-squared between big and small model
# Bar charts comparing R^2 and adjusted R^2 between the two models.
# IDIOM: geom_col() replaces geom_bar(stat = "identity") — they are exact
# equivalents, with geom_col() being the idiomatic form for pre-computed bars.
p1 <- ggplot(comparison, aes(Model, R_squared, fill = Model)) +
  geom_col(position = "dodge") +
  labs(title = "Model Comparison: R-squared", x = "Model", y = "R-squared") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12)) +
  scale_fill_manual(values = c("Big Model" = "red", "Small Model" = "green"))
p2 <- ggplot(comparison, aes(Model, Adjusted_R_squared, fill = Model)) +
  geom_col(position = "dodge") +
  labs(title = "Model Comparison: Adjusted_R_squared", x = "Model",
       y = "Adjusted_R_squared") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12)) +
  scale_fill_manual(values = c("Big Model" = "red", "Small Model" = "green"))
p1 +p2Based on the Adjusted R-squared, AIC, and BIC, the big model showed a very good fit to the data, given it had the lowest (most negative) AIC and BIC values. This implies that, together with other confounding factors, Gender better predicts an individual’s BMI. Hence, the big model demonstrated better performance than the small model on the training data set.
10. Model Validation
10.1. Calculating Prediction Errors
| Model | RMSPE | MAPE | R2 |
|---|---|---|---|
| Big Model | 28.017 | 88.274 | 0.065 |
| Small Model | 28.020 | 88.260 | 0.009 |
10.2. Visualizing the Predictions
# Combine predictions and observed data
testing_data$Pred_Big <- test_predictions_big
testing_data$Pred_Small <- test_predictions_small
# Plot for Big Model
# FIX: the original put color = "red" inside aes(), which maps the literal
# string "red" as a data value (creating a spurious legend) instead of
# coloring the points; it is now a fixed aesthetic in geom_point(), matching
# the small-model plot below.
# NOTE(review): the predictions appear to be on the log(BMI) scale while BMI
# is raw (see the error magnitudes in Section 10.3) — confirm whether they
# should be back-transformed with exp() before comparing against observed BMI.
p1 <- ggplot(testing_data, aes(x = Pred_Big, y = BMI)) +
  geom_point(color = "red") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed: Big Model",
       x = "Predicted BMI (Big Model)",
       y = "Observed BMI") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
# Plot for Small Model
p2 <- ggplot(testing_data, aes(x = Pred_Small, y = BMI)) +
  geom_point(color = "green") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed: Small Model",
       x = "Predicted BMI (Small Model)",
       y = "Observed BMI") +
  theme_classic(base_size = 8) +
  theme(
    axis.text = element_text(face = "bold", size = 10),
    axis.title.y = element_text(face = "bold", size = 10),
    axis.title.x = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12))
p1+p2The Predicted vs. Observed plots reveal that the Big Model has a wider spread of predicted values relative to the observed BMI, indicating that it captures some variability, though not with high precision, as evidenced by the clustering at lower BMI values. The small model, on the other hand, shows a more constrained range of predicted values, tightly clustered around a narrow band, hence failing to account for the variability in BMI. Although the big model provided slightly better predictions on the testing data, both models inadequately capture the variation in the data.
10.3. Summarizing the Errors
# Residuals for both models on the test set.
# FIX: the models predict log(BMI), but the original subtracted the log-scale
# predictions directly from raw BMI, producing meaningless error metrics
# (MAE ~ 27 and large negative R^2 in the rendered table). Predictions are
# back-transformed with exp() so the residuals are on the BMI scale.
residuals_big <- testing_data$BMI - exp(test_predictions_big)
residuals_small <- testing_data$BMI - exp(test_predictions_small)
# Mean Absolute Error (MAE)
mae_big <- mean(abs(residuals_big))
mae_small <- mean(abs(residuals_small))
# Root Mean Squared Error (RMSE)
rmse_big <- sqrt(mean(residuals_big^2))
rmse_small <- sqrt(mean(residuals_small^2))
# R-squared for predictions: 1 - SSE/SST on the test set
sst <- sum((testing_data$BMI - mean(testing_data$BMI))^2) # Total Sum of Squares
sse_big <- sum(residuals_big^2) # Sum of Squared Errors for big model
sse_small <- sum(residuals_small^2) # Sum of Squared Errors for small model
r_squared_big <- 1 - (sse_big / sst)
r_squared_small <- 1 - (sse_small / sst)
# Create a summary table
comparison_table <- data.frame(
  Model = c("Big Model", "Small Model"),
  MAE = c(mae_big, mae_small),
  RMSE = c(rmse_big, rmse_small),
  R_Squared = c(r_squared_big, r_squared_small)
)
comparison_table %>%kable(digits = 3) %>%
kable_styling(bootstrap_options = "striped", full_width = F)| Model | MAE | RMSE | R_Squared |
|---|---|---|---|
| Big Model | 26.999 | 28.017 | -12.959 |
| Small Model | 26.998 | 28.020 | -12.962 |
We observed that both models exhibited nearly identical predictive performance, as reflected by their RMSPE values (28.017 vs. 28.020), MAE values (26.999 vs. 26.998), and MAPE (88.274 vs. 88.260), indicating that the additional predictors in the Big Model do not significantly reduce prediction error. Whereas the big model shows a slightly higher \(R^2\) (0.065 vs. 0.009), this improvement is minimal, and both models explain very little variance in BMI. Conclusively, the results suggest that neither model effectively captures the variability in the outcome, with limited gains from including additional predictors in the big model.
10.4. Comparing the Models
Both the Big Model and Small Model exhibit similar RMSPE and MAPE values, indicating that the additional predictors in the Big Model (e.g., age, race, smoking status) do not significantly enhance prediction accuracy compared to the simpler Small Model. While the Big Model explains slightly more variance in BMI (\(𝑅^2\)=0.065) than the Small Model (\(𝑅^2\)=0.009), the low \(𝑅^2\) values for both models suggest limited explanatory power and potential unmeasured confounding variables. The Big Model’s inclusion of multiple predictors introduces complexity without meaningful gains in performance, resulting in potential overfitting. On the other hand, the Small Model’s simplicity and comparable prediction errors make it a more practical and computationally efficient choice.
# 11. Discussion
11.1. Chosen Model
Based on these results, the Small Model is preferred, though further refinement of the Big Model with more relevant predictors may enhance its utility.
11.2. Answering My Question
The research question sought to determine whether gender is significantly associated with Body Mass Index (BMI), accounting for potential confounders such as age, race, and smoking status. The results from the Big Model, which included these additional predictors, showed a slightly higher \(R^2\) (0.064) compared to the Small Model (\(R^2\) = 0.009). However, both models exhibited low explanatory power overall, suggesting that gender alone, or even in combination with the included covariates, does not account for much of the variability in BMI. These findings partially align with pre-analysis expectations, as the additional predictors were anticipated to improve explanatory power, but the improvement was minimal.
11.3. Next Steps
The low \(R^2\) values observed indicate that important predictors of BMI may be missing from the models. Factors such as dietary habits, physical activity, genetic predispositions, or socioeconomic status could contribute to the unexplained variance. Moreover, the observed similarity in prediction errors (RMSPE and MAPE) between the two models highlights potential limitations in the dataset or modeling approach. Future work should aim to include a broader range of predictors to enhance the models’ ability to explain BMI variability
11.4. Reflection
This analysis provided me with great insights into the relationship between BMI and gender, as well as the impact of other confounding predictors like age, race/ethnicity, and smoking status. Despite the big model’s inclusion of multiple variables, its predictive performance was approximately identical to the simpler small model, highlighting the importance of parsimony in modeling. Low \(R^2\) values across both models suggested unmeasured factors influencing BMI, while diagnostic checks revealed violation of model assumptions, emphasizing the need for model refinement. Ultimately, this process underscored the importance of validation, thoughtful model selection, and a potential for future model improvements through expanded datasets.
# 12. Session Information
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8
Package version:
abind_1.4-5 askpass_1.2.1 backports_1.5.0
base64enc_0.1.3 bit_4.0.5 bit64_4.0.5
blob_1.2.4 boot_1.3-30 broom_1.0.6
broom.mixed_0.2.9.6 bslib_0.8.0 cachem_1.1.0
callr_3.7.6 car_3.1-2 carData_3.0-5
cellranger_1.1.0 cli_3.6.3 clipr_0.8.0
cmprsk_2.2-12 coda_0.19.4.1 codetools_0.2-20
colorspace_2.1-1 compiler_4.4.1 conflicted_1.2.0
corrplot_0.94 cowplot_1.1.3 cpp11_0.4.7
crayon_1.5.3 curl_6.0.1 data.table_1.16.0
DBI_1.2.3 dbplyr_2.5.0 Deriv_4.1.3
digest_0.6.37 doBy_4.6.22 dplyr_1.1.4
dtplyr_1.3.1 Epi_2.53 etm_1.1.1
evaluate_0.24.0 fansi_1.0.6 farver_2.1.2
fastmap_1.2.0 fontawesome_0.5.2 forcats_1.0.0
foreach_1.5.2 foreign_0.8-86 fs_1.6.4
furrr_0.3.1 future_1.34.0 gargle_1.5.2
generics_0.1.3 ggplot2_3.5.1 ggpubr_0.6.0
ggrepel_0.9.5 ggsci_3.2.0 ggsignif_0.6.4
glmnet_4.1-8 globals_0.16.3 glue_1.8.0
googledrive_2.1.1 googlesheets4_1.1.1 GPArotation_2024.3.1
graphics_4.4.1 grDevices_4.4.1 grid_4.4.1
gridExtra_2.3 gtable_0.3.5 haven_2.5.4
highr_0.11 hms_1.1.3 htmltools_0.5.8.1
htmlwidgets_1.6.4 httr_1.4.7 ids_1.0.1
isoband_0.2.7 iterators_1.0.14 janitor_2.2.0
jomo_2.7-6 jquerylib_0.1.4 jsonlite_1.8.9
kableExtra_1.4.0 knitr_1.48 labeling_0.4.3
lattice_0.22-6 lifecycle_1.0.4 listenv_0.9.1
lme4_1.1-35.5 lmtest_0.9-40 lubridate_1.9.3
magrittr_2.0.3 MASS_7.3-60.2 Matrix_1.7-0
MatrixModels_0.5.3 memoise_2.0.1 methods_4.4.1
mgcv_1.9-1 mice_3.16.0 microbenchmark_1.4.10
mime_0.12 minqa_1.2.8 mitml_0.4-5
mnormt_2.1.1 modelr_0.1.11 munsell_0.5.1
naniar_1.1.0 nhanesA_1.2 nlme_3.1-164
nloptr_2.1.1 nnet_7.3-19 norm_1.0.11.1
numDeriv_2016.8-1.1 openssl_2.2.2 ordinal_2023.12.4.1
pan_1.9 parallel_4.4.1 parallelly_1.38.0
patchwork_1.2.0 pbkrtest_0.5.3 pillar_1.9.0
pkgconfig_2.0.3 plyr_1.8.9 polynom_1.4.1
prettyunits_1.2.0 processx_3.8.4 progress_1.2.3
ps_1.7.7 psych_2.4.6.26 purrr_1.0.2
quantreg_5.98 R6_2.5.1 ragg_1.3.2
rappdirs_0.3.3 RColorBrewer_1.1.3 Rcpp_1.0.13-1
RcppArmadillo_14.0.0.1 RcppEigen_0.3.4.0.1 readr_2.1.5
readxl_1.4.3 rematch_2.0.0 rematch2_2.1.2
reprex_2.1.1 rlang_1.1.4 rmarkdown_2.28
rpart_4.1.23 rstatix_0.7.2 rstudioapi_0.16.0
rvest_1.0.4 sass_0.4.9 scales_1.3.0
selectr_0.4.2 shape_1.4.6.1 snakecase_0.11.1
SparseM_1.84.2 splines_4.4.1 stats_4.4.1
stringi_1.8.4 stringr_1.5.1 survival_3.6-4
svglite_2.1.3 sys_3.4.3 systemfonts_1.1.0
textshaping_0.4.0 tibble_3.2.1 tidyr_1.3.1
tidyselect_1.2.1 tidyverse_2.0.0 timechange_0.3.0
tinytex_0.52 tools_4.4.1 tzdb_0.4.0
ucminf_1.2.2 UpSetR_1.4.0 utf8_1.2.4
utils_4.4.1 uuid_1.2.1 vcd_1.4-12
vctrs_0.6.5 viridis_0.6.5 viridisLite_0.4.2
visdat_0.6.0 vroom_1.6.5 withr_3.0.2
xfun_0.47 xml2_1.3.6 yaml_2.3.10
zoo_1.8-12